import pandas as pd
import numpy as np
import scipy as sy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.units as munits
import matplotlib as mpl
from scipy.special import ndtri
import matplotlib.dates as mdates
import plotly.graph_objects as go
import plotly.express as px
import warnings
import scipy.stats as st
import statsmodels.api as sm
import matplotlib
#creating a dataframe with timestamp
t_index = pd.DatetimeIndex(pd.date_range(start='2019-10-10', end='2020-06-06 23:00:00', freq="1h"))
df = pd.DataFrame(t_index, columns=['hour'])
df
| hour | |
|---|---|
| 0 | 2019-10-10 00:00:00 |
| 1 | 2019-10-10 01:00:00 |
| 2 | 2019-10-10 02:00:00 |
| 3 | 2019-10-10 03:00:00 |
| 4 | 2019-10-10 04:00:00 |
| ... | ... |
| 5779 | 2020-06-06 19:00:00 |
| 5780 | 2020-06-06 20:00:00 |
| 5781 | 2020-06-06 21:00:00 |
| 5782 | 2020-06-06 22:00:00 |
| 5783 | 2020-06-06 23:00:00 |
5784 rows × 1 columns
# Import data using datetime and no data value
hourly = pd.read_csv('Book1.csv')
hourly
| Biogas_yield | Methane | hour | |
|---|---|---|---|
| 0 | 24.369362 | 57.197067 | 10/10/2019 0:00 |
| 1 | 23.942646 | 57.182819 | 10/10/2019 1:00 |
| 2 | 24.668522 | 57.191166 | 10/10/2019 2:00 |
| 3 | 23.575047 | 57.181984 | 10/10/2019 3:00 |
| 4 | 22.973997 | 57.180664 | 10/10/2019 4:00 |
| ... | ... | ... | ... |
| 5779 | 13.000000 | NaN | 6/06/2020 19:00 |
| 5780 | 14.000000 | NaN | 6/06/2020 20:00 |
| 5781 | 14.000000 | NaN | 6/06/2020 21:00 |
| 5782 | 13.000000 | NaN | 6/06/2020 22:00 |
| 5783 | 13.000000 | NaN | 6/06/2020 23:00 |
5784 rows × 3 columns
fig = px.line(hourly, x=hourly.hour, y=hourly.Biogas_yield, title="Daily biogas production")
fig.show()
extracted_col = hourly["Biogas_yield"]
extracted_col
0 24.369362
1 23.942646
2 24.668522
3 23.575047
4 22.973997
...
5779 13.000000
5780 14.000000
5781 14.000000
5782 13.000000
5783 13.000000
Name: Biogas_yield, Length: 5784, dtype: float64
df1 = df.join(extracted_col)
df1
| hour | Biogas_yield | |
|---|---|---|
| 0 | 2019-10-10 00:00:00 | 24.369362 |
| 1 | 2019-10-10 01:00:00 | 23.942646 |
| 2 | 2019-10-10 02:00:00 | 24.668522 |
| 3 | 2019-10-10 03:00:00 | 23.575047 |
| 4 | 2019-10-10 04:00:00 | 22.973997 |
| ... | ... | ... |
| 5779 | 2020-06-06 19:00:00 | 13.000000 |
| 5780 | 2020-06-06 20:00:00 | 14.000000 |
| 5781 | 2020-06-06 21:00:00 | 14.000000 |
| 5782 | 2020-06-06 22:00:00 | 13.000000 |
| 5783 | 2020-06-06 23:00:00 | 13.000000 |
5784 rows × 2 columns
df1['hour'] = pd.to_datetime(df['hour'])
daily = df1.set_index('hour')
daily = df1.resample('D', on='hour').sum()
daily.columns
Index(['Biogas_yield'], dtype='object')
#Batch2 input:
#292.6 tonnes of RG (DM: 34.1 ± 8.4%)
#30.64 tonnes of inoculum (Biogas potential of inoculum is 30 m3/ton)
#Approximately 162 tonnes of residual grass from Batch1 (Biogas potential: 64 m3/ton)
daily
daily['Biogas_yield_FU'] = (daily['Biogas_yield'] / 485.24).round(2)
daily.to_csv('daily.csv', sep=',')
daily.describe()
| Biogas_yield | Biogas_yield_FU | |
|---|---|---|
| count | 241.000000 | 241.000000 |
| mean | 197.227960 | 0.406266 |
| std | 151.319908 | 0.311681 |
| min | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 |
| 50% | 304.000000 | 0.630000 |
| 75% | 316.000000 | 0.650000 |
| max | 339.000000 | 0.700000 |
df_latest = pd.read_csv('daily.csv', sep=',')
df_latest.describe()
| Biogas_yield | Biogas_yield_FU | |
|---|---|---|
| count | 241.000000 | 241.000000 |
| mean | 197.227960 | 0.610415 |
| std | 151.319908 | 0.468295 |
| min | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 |
| 50% | 304.000000 | 0.940000 |
| 75% | 316.000000 | 0.980000 |
| max | 339.000000 | 1.050000 |
fig = px.line(df_latest, x=df_latest.hour, y=df_latest.Biogas_yield, title="Daily biogas production")
fig.show()
#Batch2 input: 292.6 tonnes of RG (DM: 34.1 ± 8.4%)
#30.64 tonnes of inoculum
df_cumulative = daily['Biogas_yield'].cumsum()
df_updated = pd.DataFrame(df_cumulative, columns=['Biogas_yield'])
df_updated
df_updated['Biogas_yield_FU'] = (df_updated['Biogas_yield'] / 323.24).round(2)
df_updated.describe()
| Biogas_yield | Biogas_yield_FU | |
|---|---|---|
| count | 241.000000 | 241.000000 |
| mean | 15259.818143 | 47.210249 |
| std | 15774.461101 | 48.799595 |
| min | 334.938475 | 1.040000 |
| 25% | 334.938475 | 1.040000 |
| 50% | 10151.938475 | 31.410000 |
| 75% | 28806.938475 | 89.120000 |
| max | 47531.938475 | 147.050000 |
df_updated
df_updated.describe()
| Biogas_yield | Biogas_yield_FU | |
|---|---|---|
| count | 241.000000 | 241.000000 |
| mean | 15259.818143 | 47.210249 |
| std | 15774.461101 | 48.799595 |
| min | 334.938475 | 1.040000 |
| 25% | 334.938475 | 1.040000 |
| 50% | 10151.938475 | 31.410000 |
| 75% | 28806.938475 | 89.120000 |
| max | 47531.938475 | 147.050000 |
time_series = pd.read_csv('daily.csv')
time_series.describe()
| Biogas_yield | Biogas_yield_FU | |
|---|---|---|
| count | 241.000000 | 241.000000 |
| mean | 197.227960 | 0.610415 |
| std | 151.319908 | 0.468295 |
| min | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 |
| 50% | 304.000000 | 0.940000 |
| 75% | 316.000000 | 0.980000 |
| max | 339.000000 | 1.050000 |
Total = time_series["Biogas_yield_FU"]
Total_kde = Total.plot.kde(figsize=(20,10))
pd.DataFrame(Total).describe()
| Biogas_yield_FU | |
|---|---|
| count | 241.000000 |
| mean | 0.610415 |
| std | 0.468295 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 0.940000 |
| 75% | 0.980000 |
| max | 1.050000 |
time_series.columns
Index(['hour', 'Biogas_yield', 'Biogas_yield_FU'], dtype='object')
#fig = px.line(daily, x=time_series.hour, y=time_series.Biogas_yield_FU, title='Daily Biogas production')
fig = px.line(time_series, x=time_series.hour, y=time_series.Biogas_yield, title='Daily Biogas production')
fig.update_layout(xaxis_title="Date", yaxis_title="Daily biogas yield (m3/day)", width=1000)
fig.show()
data = time_series.drop(['hour', 'Biogas_yield_FU'], axis=1)
data
| Biogas_yield | |
|---|---|
| 0 | 334.938475 |
| 1 | 0.000000 |
| 2 | 0.000000 |
| 3 | 0.000000 |
| 4 | 0.000000 |
| ... | ... |
| 236 | 317.000000 |
| 237 | 316.000000 |
| 238 | 323.000000 |
| 239 | 301.000000 |
| 240 | 307.000000 |
241 rows × 1 columns
# Create models from data
def best_fit_distribution(data, bins=200, ax=None):
"""Model data by finding best fit distribution to data"""
# Get histogram of original data
y, x = np.histogram(data, bins=bins, density=True)
x = (x + np.roll(x, -1))[:-1] / 2.0
# Distributions to check
DISTRIBUTIONS = [
st.dgamma,st.dweibull,st.expon,st.exponnorm,st.gamma,st.lognorm,st.norm,st.t,st.triang,
st.uniform
]
# Best holders
best_distribution = st.norm
best_params = (0.0, 1.0)
best_sse = np.inf
# Estimate distribution parameters from data
for distribution in DISTRIBUTIONS:
# Try to fit the distribution
try:
# Ignore warnings from data that can't be fit
with warnings.catch_warnings():
warnings.filterwarnings('ignore')
# fit dist to data
params = distribution.fit(data)
# Separate parts of parameters
arg = params[:-2]
loc = params[-2]
scale = params[-1]
# Calculate fitted PDF and error with fit in distribution
pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
sse = np.sum(np.power(y - pdf, 2.0))
# if axis pass in add to plot
try:
if ax:
pd.Series(pdf, x).plot(ax=ax)
end
except Exception:
pass
# identify if this distribution is better
if best_sse > sse > 0:
best_distribution = distribution
best_params = params
best_sse = sse
except Exception:
pass
return (best_distribution.name, best_params)
def make_pdf(dist, params, size=10000):
"""Generate distributions's Probability Distribution Function """
# Separate parts of parameters
arg = params[:-2]
loc = params[-2]
scale = params[-1]
# Get sane start and end points of distribution
start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)
# Build PDF and turn into pandas Series
x = np.linspace(start, end, size)
y = dist.pdf(x, loc=loc, scale=scale, *arg)
pdf = pd.Series(y, x)
return pdf
# Plot for comparison
plt.figure(figsize=(20,15))
ax = data.plot(kind='hist', bins=200, density=True, alpha=0.5, color = [color['color'] for color in list(mpl.rcParams['axes.prop_cycle'])])
# Save plot limits
dataYLim = ax.get_ylim()
# Find best fit distribution
best_fit_name, best_fit_params = best_fit_distribution(data, 200, ax)
best_dist = getattr(st, best_fit_name)
# Update plots
ax.set_ylim(dataYLim)
ax.set_title(u'Biogas yield.\n Vanheede ')
ax.set_xlabel(u'Biogas yield (m3/day)')
ax.set_ylabel('Frequency')
# Make PDF with best params
pdf = make_pdf(best_dist, best_fit_params)
# Display
plt.figure(figsize=(20,15))
ax = pdf.plot(lw=2, label='PDF', legend=True)
data.plot(kind='hist', bins=50, density=True, alpha=0.5, label='Data', legend=True, ax=ax)
param_names = (best_dist.shapes + ', loc, scale').split(', ') if best_dist.shapes else ['loc', 'scale']
param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, best_fit_params)])
dist_str = '{}({})'.format(best_fit_name, param_str)
ax.set_title(u'Biogas yield. with best fit distribution \n' + dist_str)
ax.set_xlabel(u'Biogas yield (m3/day)')
ax.set_ylabel('Frequency')
Text(0, 0.5, 'Frequency')
<Figure size 1440x1080 with 0 Axes>